Prosper is a peer-to-peer lending marketplace. At the time of writing, Prosper operates in the US and has issued loans worth over USD 22 billion to c. 1.4 million borrowers. Borrowers apply online for a fixed-rate, fixed-term loan between USD 2,000.00 and USD 40,000.00. Individuals, like you and me, and institutions, Sequoia Capital for example, invest in said loans. Prosper handles all loan servicing on behalf of the borrowers and investors.
The data set at hand has 113,937 observations (loans, if you like) and 81 variables; detailed information on the variables can be found here. Observations relate to the years 2005 to 2014, inclusive.
#upgrade `pandas`
!pip install --upgrade pandas
#import all packages and set plots to be embedded inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='once')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from random import randint
from os import path
from contextlib import suppress
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
%matplotlib inline
automate, as much as possible, the process of creating visualisations
why?
how?
#template no. 1: colours used by the plots below
#default palette (seaborn's `tab10`)
default_palette = sns.color_palette('tab10')
#default blue and, if required, default orange: the first two palette entries
default_blue, default_orange = default_palette[:2]
#template no. 2
def create_fig(x_lab: str, y_lab: str, title: str) -> None:
    """Create a matplotlib figure with an x-label, a y-label and a title.

    "Father Figure", if you like :)

    The figure is opened through `plt.figure`, so it becomes pyplot's
    current figure and later `plt.*` calls draw onto it.

    Args:
        x_lab: text for the x-axis label.
        y_lab: text for the y-axis label.
        title: text for the title of the plot.

    Returns:
        None.

    Raises:
        Exception: any error other than `ModuleNotFoundError` is re-raised
            after 'Failed to create template' has been printed.
    """
    try:
        #fig size
        plt.figure(figsize=(10, 6.18), dpi=216, frameon=False, clear=True)
        #x-axis name
        plt.xlabel(x_lab)
        #y-axis name
        plt.ylabel(y_lab)
        #title
        plt.title(title)
    except ModuleNotFoundError:
        print('Please `import matplotlib.pyplot as plt` and try again')
    except Exception:
        #`Exception` (not a bare `except`) so that Ctrl-C and interpreter
        #exit are not reported as a template failure; a missing `plt` name
        #raises NameError and therefore also ends up here
        print('Failed to create template')
        raise
#template no. 3
def create_sub(n_row: int=1, n_col: int=1):
    """Create a matplotlib figure that holds a grid of sub-plots.

    "Father Figure" for subplots :)

    Args:
        n_row: number of rows of sub-plots.
        n_col: number of columns of sub-plots.

    Returns:
        The `(fig, ax)` pair returned by `plt.subplots`. Nothing is
        returned when the `ModuleNotFoundError` branch is taken.

    Raises:
        Exception: any error other than `ModuleNotFoundError` is re-raised
            after 'Failed to create template' has been printed.
    """
    try:
        #fig, ax and figsize
        fig, ax = plt.subplots(n_row, n_col, figsize=(10, 6.18), dpi=216)
        #generous padding between the sub-plots and the figure edge
        fig.tight_layout(pad=10.0)
        return fig, ax
    except ModuleNotFoundError:
        print('Please `import matplotlib.pyplot as plt` and try again')
    except Exception:
        #`Exception` (not a bare `except`) so that Ctrl-C and interpreter
        #exit are not reported as a template failure
        print('Failed to create template')
        raise
#see if a df exists
def confirm_df_exists(df):
    """Print whether `df` holds any data.

    A frame counts as existing when its `empty` attribute is False.
    `None`, or any object without an `empty` attribute, is reported as
    not existing instead of raising `AttributeError`.

    Args:
        df: the dataframe object itself (please do not pass its name as
            a string).

    Returns:
        None; the verdict is printed.
    """
    #a missing `empty` attribute is treated the same as an empty frame
    if not getattr(df, 'empty', True):
        print('This dataframe exists')
        return
    print('This dataframe does not exist')
#group data and find mean
def group_avg(var_1, var_2, data=None):
    """Calculate the mean of `var_2` within each group of `var_1`.

    Args:
        var_1: name of the column to group by.
        var_2: name of the column to average.
        data: pandas DF to work on; when omitted, the notebook-level
            `df` is used.

    Returns:
        A pandas Series of group means indexed by the values of `var_1`,
        or None (after printing a hint) when either name is empty.
    """
    if not (var_1 and var_2):
        print('Check that both arguments are columns of a pandas DF and try again')
        return None
    #fall back on the global `df` so existing two-argument calls keep working
    frame = df if data is None else data
    return frame.groupby(var_1)[var_2].mean()
#group data and find sum
def group_sum(var_1, var_2, data=None):
    """Calculate the sum of `var_2` within each group of `var_1`.

    Args:
        var_1: name of the column to group by.
        var_2: name of the column to add up.
        data: pandas DF to work on; when omitted, the notebook-level
            `df` is used.

    Returns:
        A pandas Series of group sums indexed by the values of `var_1`,
        or None (after printing a hint) when either name is empty.
    """
    if not (var_1 and var_2):
        print('Check that both arguments are columns of a pandas DF and try again')
        return None
    #fall back on the global `df` so existing two-argument calls keep working
    frame = df if data is None else data
    return frame.groupby(var_1)[var_2].sum()
def custom_sort_df(x, y, ascending: bool=True):
    """Pair up `x` and `y` in a DF and sort it by the `y` values.

    Args:
        x: pandas series (or index) of labels.
        y: pandas series (or array) of counts, in the same order as `x`.
        ascending: sort direction, default True (smallest count first).

    Returns:
        A pandas DF with a fresh 0..n-1 index and two columns: the labels
        (first column) and `Count`.

    Raises:
        Exception: anything raised while building or sorting the DF is
            re-raised after a hint has been printed.
    """
    try:
        #a single row named `Count` with one column per label, then flipped
        #so that the labels form the index
        counts = pd.DataFrame(dict(zip(x, y)), index=['Count']).transpose()
        #`reset_index` moves the labels into the first column
        return counts.sort_values('Count', ascending=ascending).reset_index()
    except Exception:
        #`Exception` (not a bare `except`) so that Ctrl-C is not reported
        #as bad input
        print('Check that x and y are pandas series and try again')
        raise
def confirm_file_exists(file_name: str):
    """Print whether `file_name` points at an existing file.

    `path.isfile` is used so that a directory which happens to carry the
    same name is not reported as the file.

    Args:
        file_name: name of the file (include the extension, please);
            relative names are resolved against the current dir.

    Returns:
        None; the verdict is printed.
    """
    if path.isfile(file_name):
        print('File exists')
    else:
        print('Something went wrong. Investigate')
#read the raw data set (comma-separated) into `df`
df = pd.read_csv('prosperLoanData.csv')
confirm_df_exists(df)
#dimensions as (observations, variables)
df.shape
#count of rows that are / are not a repeat of an earlier row
df.duplicated().value_counts()
#dtype and non-null count of every variable
df.info()
#look at a random handful (5 to 15) of observations
df.sample(n=randint(5, 15))
- bool, float, int, str
- LoanOriginationDate is of type str
- Occupation and IncomeRange are of type str
- df have missing or null values
- GroupKey, LoanFirstDefaultedCycleNumber
- GroupKey represents observations that fall under a criterion; the number of said groups cannot reasonably be expected to be 113,937
- df has no duplicated observations
- df have 1 value per variable
- LoanOriginalAmount, LoanOriginationDate and BorrowerRate will help answer the first question
- BorrowerRate, Term, BorrowerAPR, StatedMonthlyIncome and EmploymentStatusDuration among others will help answer the next two questions
- BorrowerRate, PercentFunded, and Investors will help answer the last question
- LoanOriginationDate to datetime
- Occupation and IncomeRange to category
#create a copy of the df
#carry on with a copy of the df
df = df.copy()
#drop variables whose missing values are above the threshold (85%)
#`isna()` marks the missing cells; `eq(0)` would mark zeros, not gaps
df.drop(columns=df.columns[df.isna().mean() > 0.85], inplace=True)
df.shape
#drop observations whose missing values are above the threshold (90%)
#`axis=1` averages across each row, giving one flag per observation; the
#default (per-column) mean cannot be used to select rows
df.drop(index=df.index[df.isna().mean(axis=1) > 0.9], inplace=True)
#write the cleaned data set to a `.csv` file, then read it back in as `df`
#write step: comma-separated, utf-8, without the index column
df.to_csv('prosper_loans_data.csv', sep=',', encoding='utf-8', index=False)
#make sure the `.csv` file is there
confirm_file_exists('prosper_loans_data.csv')
#read step: the clean df replaces the old one
df = pd.read_csv('prosper_loans_data.csv')
confirm_df_exists(df)
#parse variable `LoanOriginationDate` into `datetime`
df['LoanOriginationDate'] = pd.to_datetime(df['LoanOriginationDate'])
#look at a few (3 to 5) of the converted values
df['LoanOriginationDate'].sample(n=randint(3, 5))
#store variables `Occupation` and `IncomeRange` as `category`
df['Occupation'] = df['Occupation'].astype('category')
df['IncomeRange'] = df['IncomeRange'].astype('category')
#handling categorical variables for ease of plotting
#variable name -> its categories, listed in the order they should rank
df_0 = {
    'LoanStatus': [
        'Current', 'Completed', 'FinalPaymentInProgress',
        'Past Due (1-15 days)', 'Past Due (16-30 days)',
        'Past Due (31-60 days)', 'Past Due (61-90 days)',
        'Past Due (91-120 days)', 'Past Due (>120 days)',
        'Defaulted', 'ChargedOff', 'Cancelled',
    ],
    'IncomeRange': [
        '$0', '$1-24,999', '$25,000-49,999', '$50,000-74,999',
        '$75,000-99,999', '$100,000+', 'Not displayed', 'Not employed',
    ],
    'ProsperRating (Alpha)': ['AA', 'A', 'B', 'C', 'D', 'E', 'HR'],
    'IsBorrowerHomeowner': [True, False],
    'IncomeVerifiable': [False, True],
}
#cast every listed variable to an ordered categorical
#NOTE(review): a value that is absent from its list turns into NaN on the
#cast -- confirm the spellings above match the data
for i in df_0:
    ordered_df = pd.CategoricalDtype(df_0[i], ordered=True)
    df[i] = df[i].astype(ordered_df)
df.info()
sweetviz
# pre_eda_rpt = sv.analyze(df)
# pre_eda_rpt.show_html('./Summary_EDA_Report.html')
##does not work; notebook does not support sweetviz :(
#summary statistics of the amount borrowed, shown as a one-column table
df['LoanOriginalAmount'].describe().to_frame()
- Prosper could have revised its loan policy
- Prosper could have increased its product offering
a histogram of the above...
#histogram of the amount borrowed, 50 bins
create_fig(
    x_lab='Amount borrowed (USD)',
    y_lab='Frequency (Count)',
    title='Frequency distribution of amount(s) borrowed',
)
plt.hist(df['LoanOriginalAmount'], bins=50, color=default_blue);
#summary statistics of the borrower rate, shown as a one-column table
df['BorrowerRate'].describe().to_frame()
- Prosper offer interest-free loans?
a histogram of the above supports the findings...
#histogram of the borrower rate, 100 bins
create_fig(
    x_lab='Rate(s) of borrowing (%)',
    y_lab='Frequency (Count)',
    title='Frequency distribution of borrower rates',
)
plt.hist(df['BorrowerRate'], bins=100, color=default_blue);
#total of `LoanNumber` for every distinct `LoanOriginationDate`
#NOTE(review): this adds up the values held in `LoanNumber`; if that column
#is a loan identifier, a count per date may be what is wanted -- confirm
sum_loan_time = group_sum('LoanOriginationDate', 'LoanNumber')
#bar heights: one entry per origination date
y = list(sum_loan_time.values)
#bar positions: the calendar year of each origination date
#NOTE(review): dates in the same year share a position, so their bars are
#drawn on top of each other and are not added together -- confirm intent
x = [stamp.year for stamp in sum_loan_time.index]
#create plot
create_fig(
    x_lab='Time (Years)',
    y_lab='#Loans issued (tens of millions)',
    title='Number of loans issued YoY',
)
plt.bar(x, y, tick_label=x, color=default_blue)
plt.yticks(np.arange(0, 65_000_000, 10_000_000));
#basic descriptive stats of `sum_loan_time`
sum_loan_time.to_frame().describe()
#time series plot of `sum_loan_time`, one point per origination date
create_fig(
    x_lab='Time (Days)',
    y_lab='#Loans issued (tens of millions)',
    title='Number of loans issued over time',
)
sum_loan_time.plot.line(color=default_blue)
#replaces the x-label set through `create_fig`
plt.xlabel('Time (delta = 1 day)');
the number of loans disbursed changes with time
there appears to be an exponential trend
findings from summary statistics
What, really, happens:
#visualise said timelines
#two stacked sub-plots (2 rows, 1 column) in one figure; both draw the same
#series, `sum_loan_time`, and differ only in the x-axis window
create_sub(2,1)
plt.suptitle('Time series plots for outlier data')
#sub-plot #1: x-axis limited to 2005-12-31 .. 2008-01-01
#`plt.subplot(2, 1, 1)` makes the top panel current, so the pandas plot and
#the label calls that follow apply to it
plt.subplot(2, 1, 1)
sum_loan_time.plot(kind='line',color=default_blue, xlim=['2005-12-31','2008-01-01'], title='#Loans issued (2005 - 2007)', fontsize=9)
plt.xlabel('Time (delta = 1 day)')
plt.ylabel('#Loans issued');
#sub-plot #2: x-axis limited to 2008-09-01 .. 2009-09-30
#`plt.subplot(2,1,2)` makes the bottom panel current
plt.subplot(2,1,2)
sum_loan_time.plot(kind='line',color=default_blue, xlim=['2008-09-01','2009-09-30'], title='#Loans issued (Q4 2008 - Q3 2009)', fontsize=9)
plt.xlabel('Time (delta = 1 day)')
#one tick every three months across the window
plt.xticks(['2008-09', '2008-12', '2009-03', '2009-06', '2009-09'])
plt.ylabel('#Loans issued');
- LoanOriginalAmount:
- LoanOriginationDate and LoanNumber:
- BorrowerRate:
- LoanOriginationDate and LoanNumber were processed to show the number of loans issued over time
- LoanOriginationDate was converted to type datetime
#1. occupation: top ten
#first ten entries of the occupation frequency table: labels, then counts
x = df['Occupation'].value_counts().index[:10]
y = df['Occupation'].value_counts().to_numpy()[:10]
#sort top 10 (smallest count first)
dd = custom_sort_df(x, y)
#create viz
create_fig(
    x_lab='Count',
    y_lab='Occupation',
    title="Top ten occupations of Prosper's customers",
)
#one thick horizontal line per occupation, running from 0 to its count
plt.hlines(y=dd.index, xmin=0, xmax=dd['Count'], color=default_blue, linewidth=20)
#label every line with the occupation stored in the first column of `dd`
plt.yticks(np.arange(len(dd)), dd.iloc[:, 0]);
Categories Other and Professional
- Other is ambiguous; not quite certain what it represents
- Professional, according to the US Bureau of Labor Statistics, appears to be a collective term
What happens when Other and Professional are excluded?
#2. occupation: top ten sans `Others` and `Professional`
#entries 3 to 12 of the occupation frequency table: labels, then counts
#NOTE(review): skipping the first two entries presumes they are `Other` and
#`Professional` -- confirm against the frequency table
x = df['Occupation'].value_counts().index[2:12]
y = df['Occupation'].value_counts().to_numpy()[2:12]
#sort top 10 (smallest count first)
dd = custom_sort_df(x, y)
#create viz
create_fig(
    x_lab='Count',
    y_lab='Occupation',
    title="Top ten occupations of Prosper's customers (sans 'Other' and 'Professional')",
)
#one thick horizontal line per occupation, running from 0 to its count
plt.hlines(y=dd.index, xmin=0, xmax=dd['Count'], color=default_blue, linewidth=20)
#label every line with the occupation stored in the first column of `dd`
plt.yticks(np.arange(len(dd)), dd.iloc[:, 0]);
#3. occupation: bottom 10
#last ten entries of the occupation frequency table: labels, then counts
x = df['Occupation'].value_counts().index[-10:]
y = df['Occupation'].value_counts().to_numpy()[-10:]
#sort bottom 10 (smallest count first)
dd = custom_sort_df(x, y)
#create viz
create_fig(
    x_lab='Count',
    y_lab='Occupation',
    title="Bottom ten occupations of Prosper's customers",
)
#one thick horizontal line per occupation, running from 0 to its count
plt.hlines(y=dd.index, xmin=0, xmax=dd['Count'], color=default_blue, linewidth=20)
#label every line with the occupation stored in the first column of `dd`
plt.yticks(np.arange(len(dd)), dd.iloc[:, 0]);
#table of the top ten sans the 2 unclear categories
x = df['Occupation'].value_counts().index[2:12]
y = df['Occupation'].value_counts().to_numpy()[2:12]
#one row named `Count`, flipped so that each occupation gets its own row
top_ten = pd.DataFrame(dict(zip(x, y)), index=['Count']).T
top_ten
#table of the bottom ten
x = df['Occupation'].value_counts().index[-10:]
y = df['Occupation'].value_counts().to_numpy()[-10:]
bottom_ten = pd.DataFrame(dict(zip(x, y)), index=['Count']).T
bottom_ten
#4. income Levels
#full frequency table of the income ranges: labels, then counts
x = df['IncomeRange'].value_counts().index
y = df['IncomeRange'].value_counts().to_numpy()
#sort income levels by count (smallest first)
dd = custom_sort_df(x, y)
#create viz
create_fig(
    x_lab='Count',
    y_lab='Income level (USD, yearly)',
    title="Prosper's customers by income level",
)
#one thick horizontal line per income level, running from 0 to its count
plt.hlines(y=dd.index, xmin=0, xmax=dd['Count'], color=default_blue, linewidth=20)
#label every line with the income level stored in the first column of `dd`
plt.yticks(np.arange(len(dd)), dd.iloc[:, 0]);
#table of the income levels, one row each
pd.DataFrame(dict(zip(x, y)), index=['Count']).T
- $25,000-49,999
- $50,000-74,999
- $100,000+
- $75,000-99,999
- Not displayed is ambiguous and, as such, cannot be interpreted
- Not employed. Not sure if an employee can earn USD 0.00 p.a.; this article and this one have diverging views
#5. home ownership status
#home ownership: frequency table split into labels (x) and counts (y)
x = df['IsBorrowerHomeowner'].value_counts().index
y = df['IsBorrowerHomeowner'].value_counts().to_numpy()
#create viz
create_fig(
    x_lab='Is borrower a home-owner?',
    y_lab='Count',
    title="Home ownership status of Prosper's customers",
)
plt.bar(x, y, color=default_blue)
#NOTE(review): the tick text presumes the bar for False sits at position 0
#and the bar for True at position 1 -- confirm on the rendered plot
plt.xticks(np.arange(2), ['No', 'Yes']);
#table of the home ownership status, one row each
pd.DataFrame(dict(zip(x, y)), index=['Count']).T
#6. location by state
#state top-ten: first ten entries of the state frequency table
x = df['BorrowerState'].value_counts().index[:10]
y = df['BorrowerState'].value_counts().to_numpy()[:10]
#sort states (largest count first)
dd = custom_sort_df(x, y, ascending=False)
#create viz
create_fig(
    x_lab='Place of residence (State)',
    y_lab='Count',
    title="Prosper's customers by location (top 10 states)",
)
#one thick vertical line per state, rising from 0 to its count
plt.vlines(x=dd.index, ymin=0, ymax=dd['Count'], color=default_blue, linewidth=20)
#label every line with the state stored in the first column of `dd`
plt.xticks(np.arange(len(dd)), dd.iloc[:, 0]);
#state bottom ten: last ten entries of the state frequency table
x = df['BorrowerState'].value_counts().index[-10:]
y = df['BorrowerState'].value_counts().to_numpy()[-10:]
#sort states (largest count first)
dd = custom_sort_df(x, y, ascending=False)
#create viz
create_fig(
    x_lab='Place of residence (State)',
    y_lab='Count',
    title="Prosper's customers by location (bottom 10 states)",
)
plt.vlines(x=dd.index, ymin=0, ymax=dd['Count'], color=default_blue, linewidth=20)
plt.xticks(np.arange(len(dd)), dd.iloc[:, 0]);
#table of the top ten states, one row each
x = df['BorrowerState'].value_counts().index[:10]
y = df['BorrowerState'].value_counts().to_numpy()[:10]
pd.DataFrame(dict(zip(x, y)), index=['Count']).T
#table of the bottom ten states, one row each
x = df['BorrowerState'].value_counts().index[-10:]
y = df['BorrowerState'].value_counts().to_numpy()[-10:]
pd.DataFrame(dict(zip(x, y)), index=['Count']).T
#7 employment status
#employment status: frequency table split into labels (x) and counts (y)
x = df['EmploymentStatus'].value_counts().index
y = df['EmploymentStatus'].value_counts().to_numpy()
#sort employment statuses by count (smallest first)
dd = custom_sort_df(x, y)
#create viz
create_fig(
    x_lab='Count',
    y_lab='Employment status',
    title="Employment status of Prosper's customers",
)
#one thick horizontal line per status, running from 0 to its count
plt.hlines(y=dd.index, xmin=0, xmax=dd['Count'], color=default_blue, linewidth=20)
#label every line with the status stored in the first column of `dd`
plt.yticks(np.arange(len(dd)), dd.iloc[:, 0]);
Employed is ambiguous; does it mean:
- Full-time, Part-time or Self-employed
- Is it to be treated as its own category?
EmploymentStatus
BorrowerRate?
- BorrowerRate was discussed in Question #2
- BorrowerRate against:
#1. term
#see correlation between `Term` and `BorrowerRate`
fig, ax = create_sub()
#x and y are passed by keyword: seaborn 0.12 and later reject them as
#positional arguments
sns.stripplot(x=df.Term, y=df.BorrowerRate, color=default_blue)
#title
plt.title('How the term of a loan correlates to the rate')
#labels
plt.xlabel('Term (months)')
plt.ylabel('Rate (% pa, nominal)');
#correlation matrix of `Term` and `BorrowerRate`
corr_df = df[['Term', 'BorrowerRate']]
corr_df.corr()
#jittered stripplot for `BorrowerRate` against `Term`
fig, ax = create_sub()
sns.stripplot(x=df.Term, y=df.BorrowerRate, jitter=0.25, size=3, ax=ax, linewidth=.5)
#title
plt.title('How the term of a loan correlates to the rate (jittered)')
#labels
plt.xlabel('Term (months)')
plt.ylabel('Rate (% pa, nominal)');
- Term = 12 months are not as dense (tightly packed, if you like) as, say, Term = 36 months
- Term = 36 months
- Term and BorrowerRate
#2. APR
#see correlation between `BorrowerAPR` and `BorrowerRate`
fig, ax = create_sub()
#x and y are passed by keyword: seaborn 0.12 and later reject them as
#positional arguments
sns.stripplot(x=df.BorrowerAPR, y=df.BorrowerRate, color=default_blue)
#title
plt.title('How the APR of a loan correlates to the rate')
#labels
plt.xlabel('Rate (% pa, effective)')
plt.ylabel('Rate (% pa, nominal)')
#suppress xticks (an empty list removes them); only the trend is required
plt.xticks([]);
#correlation matrix of `BorrowerAPR` and `BorrowerRate`
corr_df = df[['BorrowerAPR', 'BorrowerRate']]
corr_df.corr()
- BorrowerAPR and BorrowerRate
what happens when data points are jittered?
#jittered stripplot for `BorrowerRate` against `BorrowerAPR`
fig, ax = create_sub()
#x and y are passed by keyword: seaborn 0.12 and later reject them as
#positional arguments
sns.stripplot(x=df.BorrowerAPR, y=df.BorrowerRate, jitter=1.5, size=3, ax=ax, linewidth=.5)
#title
plt.title('How the APR of a loan correlates to the rate (jittered)')
#labels
plt.xlabel('Rate (% pa, effective)')
plt.ylabel('Rate (% pa, nominal)')
#suppress xticks (an empty list removes them); only the trend is required
plt.xticks([]);
#3. stated monthly income
#see correlation between `StatedMonthlyIncome` and `BorrowerRate`
fig, ax = create_sub()
sns.stripplot(x=df.StatedMonthlyIncome, y=df.BorrowerRate, color=default_blue)
#title
plt.title('How the stated income of a borrower correlates to the rate')
#labels
plt.xlabel('Stated income (USD, monthly)')
plt.ylabel('Rate (% pa, nominal)')
#suppress xticks; only the trend is required
plt.xticks([]);
#correlation matrix of `StatedMonthlyIncome` and `BorrowerRate`
corr_df = df[['StatedMonthlyIncome', 'BorrowerRate']]
corr_df.corr()
- BorrowerRate is clustered at the range 0.05% to 0.35% for all levels of StatedMonthlyIncome
- StatedMonthlyIncome
what happens when data points are jittered?
#jittered stripplot for `BorrowerRate` against `StatedMonthlyIncome`
fig, ax = create_sub()
#x and y are passed by keyword: seaborn 0.12 and later reject them as
#positional arguments
sns.stripplot(x=df.StatedMonthlyIncome, y=df.BorrowerRate, jitter=0.5, size=3, ax=ax, linewidth=.5)
#title
plt.title('How the stated income of a borrower correlates to the rate (jittered)')
#labels
plt.xlabel('Stated income (USD, monthly)')
plt.ylabel('Rate (% pa, nominal)')
#suppress xticks (an empty list removes them); only the trend is required
plt.xticks([]);
#4. employment duration
#see correlation between `EmploymentStatusDuration` and `BorrowerRate`
fig, ax = create_sub()
sns.stripplot(x=df.EmploymentStatusDuration, y=df.BorrowerRate, color=default_blue)
#title
plt.title('How the employment duration of a borrower correlates to the rate')
#labels
plt.xlabel('Employment status duration (days)')
plt.ylabel('Rate (% pa, nominal)');
#suppress xticks; only the trend is required
plt.xticks([]);
#correlation matrix of `EmploymentStatusDuration` and `BorrowerRate`
corr_df = df[['EmploymentStatusDuration', 'BorrowerRate']]
corr_df.corr()
- BorrowerRate is clustered at the range 0.05% to 0.35% for most levels of EmploymentStatusDuration
what happens when data points are jittered?
#jittered stripplot for `BorrowerRate` against `EmploymentStatusDuration`
fig, ax = create_sub()
#x and y are passed by keyword: seaborn 0.12 and later reject them as
#positional arguments
sns.stripplot(x=df.EmploymentStatusDuration, y=df.BorrowerRate, jitter=0.25, size=3, ax=ax, linewidth=.5)
#title
plt.title('How the employment duration of a borrower correlates to the rate (jittered)')
#labels
plt.xlabel('Employment status duration (days)')
plt.ylabel('Rate (% pa, nominal)');
#suppress xticks (an empty list removes them); only the trend is required
plt.xticks([]);
#see correlation between `BorrowerRate` and `PercentFunded`
fig, ax = create_sub()
sns.stripplot(x=df.BorrowerRate, y=df.PercentFunded, color=default_blue)
#title
plt.title('How the nominal rate of a loan correlates to the proportion of loan funded')
#labels
plt.xlabel('Rate (% pa, nominal)')
plt.ylabel('Proportion of loan funded (per unit)')
#suppress xticks; only the trend is required
plt.xticks([]);
#correlation matrix of `BorrowerRate` and `PercentFunded`
corr_df = df[['BorrowerRate', 'PercentFunded']]
corr_df.corr()
- PercentFunded is clustered at the range 0.7 (70% funding) to 1 (100% funding) for all levels of BorrowerRate
what happens when data points are jittered?
#jittered stripplot for `PercentFunded` against `BorrowerRate`
fig, ax = create_sub()
#x and y are passed by keyword: seaborn 0.12 and later reject them as
#positional arguments
sns.stripplot(x=df.BorrowerRate, y=df.PercentFunded, jitter=0.25, size=3, ax=ax, linewidth=.5)
#title
plt.title('How the nominal rate of a loan correlates to the proportion of loan funded (jittered)')
#labels
plt.xlabel('Rate (% pa, nominal)')
plt.ylabel('Proportion of loan funded (per unit)')
#suppress xticks (an empty list removes them); only the trend is required
plt.xticks([]);
#see correlation between `BorrowerRate` and `Investors`
fig, ax = create_sub()
sns.stripplot(x=df.BorrowerRate, y=df.Investors, color=default_blue)
#title
plt.title('How the nominal rate of a loan correlates to the number of investors in said loan')
#labels
plt.xlabel('Rate (% pa, nominal)')
plt.ylabel('Number of Investors');
#suppress xticks; only the trend is required
plt.xticks([]);
#correlation matrix of `BorrowerRate` and `Investors`
corr_df = df[['BorrowerRate', 'Investors']]
corr_df.corr()
- Investors is clustered at the range 0 to c. 400 for all levels of BorrowerRate
- Investors = 0
what happens when data points are jittered?
#jittered stripplot for `Investors` against `BorrowerRate`
fig, ax = create_sub()
#x and y are passed by keyword: seaborn 0.12 and later reject them as
#positional arguments
sns.stripplot(x=df.BorrowerRate, y=df.Investors, jitter=0.25, size=3, ax=ax, linewidth=.5)
#title
plt.title('How the nominal rate of a loan correlates to the number of investors in said loan (jittered)')
#labels
plt.xlabel('Rate (% pa, nominal)')
plt.ylabel('Number of Investors');
#suppress xticks (an empty list removes them); only the trend is required
plt.xticks([]);
BorrowerRate vs Term
BorrowerRate vs BorrowerAPR
- BorrowerAPR and BorrowerRate
BorrowerRate vs StatedMonthlyIncome
BorrowerRate vs EmploymentStatusDuration
PercentFunded vs BorrowerRate
- PercentFunded is clustered at the range 0.7 (70% funding) to 1 (100% funding) for all levels of BorrowerRate
PercentFunded vs Investors
- Investors is clustered at the range 0 to c. 400 for all levels of BorrowerRate
BorrowerRate vs Term
- Term = 36 months
BorrowerRate vs BorrowerAPR
BorrowerRate vs StatedMonthlyIncome
BorrowerRate vs EmploymentStatusDuration
PercentFunded vs BorrowerRate
PercentFunded vs Investors
#df to use for viz
#the five variables to correlate, taken straight from `df`
corr_df = df[['BorrowerRate', 'Term', 'BorrowerAPR', 'StatedMonthlyIncome', 'EmploymentStatusDuration']]
#keep only the observations with a known `EmploymentStatusDuration`
corr_df = corr_df[corr_df['EmploymentStatusDuration'].notna()]
corr_df.info()
#sub-plots
fig, ax = create_sub()
#correlation matrix
corr = corr_df.corr()
#plot
#`warnings.catch_warnings` is what silences warnings; `contextlib.suppress`
#only swallows raised exceptions
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=DeprecationWarning)
    warnings.simplefilter('ignore', category=ImportWarning)
    #all-False mask, i.e. no cell is hidden; the builtin `bool` stands in
    #for the `np.bool` alias, which NumPy 1.24 removed
    sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(255, 32, l=65, center="dark", as_cmap=True), square=True, ax=ax, annot=True, fmt="0.2f", linewidths=.5);
corr
- Term and BorrowerRate -> c. 0.02
- Term and StatedMonthlyIncome -> c. 0.03
- Term and EmploymentStatusDuration -> c. 0.09
- StatedMonthlyIncome and EmploymentStatusDuration -> c. 0.07
- BorrowerRate and StatedMonthlyIncome -> c. -0.09
- BorrowerRate and EmploymentStatusDuration -> c. -0.02
- Term and BorrowerAPR -> c. -0.02
- StatedMonthlyIncome and BorrowerAPR -> c. -0.08
- EmploymentStatusDuration and BorrowerAPR -> c. -0.01
- BorrowerAPR and BorrowerRate -> c. 0.99
#df to use for viz
#the three variables to correlate, taken straight from `df`
corr_df = df[['BorrowerRate', 'PercentFunded', 'Investors']]
corr_df.info()
#sub-plots
fig, ax = create_sub()
#correlation matrix
corr = corr_df.corr()
#plot
#`warnings.catch_warnings` is what silences warnings; `contextlib.suppress`
#only swallows raised exceptions
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=DeprecationWarning)
    warnings.simplefilter('ignore', category=ImportWarning)
    #all-False mask, i.e. no cell is hidden; the builtin `bool` stands in
    #for the `np.bool` alias, which NumPy 1.24 removed
    sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(255, 32, l=65, center="dark", as_cmap=True), square=True, ax=ax, annot=True, fmt="0.2f", linewidths=.5);
corr
- BorrowerRate and PercentFunded -> c. -0.03
- BorrowerRate and Investors -> c. -0.27
- PercentFunded and Investors -> c. -0.05
- BorrowerRate as much as expected
- BorrowerRate do not affect each other significantly
- Investors
- BorrowerRate and Term -> c. 0.02
- BorrowerRate and StatedMonthlyIncome -> c. -0.09
- BorrowerRate and EmploymentStatusDuration -> c. -0.02
- BorrowerRate and BorrowerAPR -> c. 0.99
- Investors and BorrowerRate -> c. -0.27
- Investors and PercentFunded -> c. -0.05
from subprocess import call
#export the notebook to HTML; nbconvert reads a notebook (`.ipynb`) and
#writes the format named after `--to`
#NOTE(review): the notebook name is inferred from the `.html` name that was
#passed here before -- confirm it matches the file on disk
call(['python', '-m', 'nbconvert', '--to', 'html', 'Part_1_exploration.ipynb'])